############################################################


#Replication files for:

#Introducing COMEPELDA: 
#Comprehensive European Parliament electoral data covering
#rules, parties and candidates.
#European Union Politics.
# https://doi.org/10.1177%2F14651165211053439

#by:
#Thomas Däubler (thomas.daubler@ucd.ie)
#Mihail Chiru
#Silje Synnøve Lyder Hermansen
############################################################


# R version 4.0.4

library(ggplot2) # v 3.3.5
library(readstata13) # v 0.10.0
library(dplyr) # v 1.0.5
library(countrycode) # v 1.2.0
library(eurostat) # v 3.7.5
library(colorspace) # v 2.0.0

# the COMEPELDA data are available from https://doi.org/10.7910/DVN/GNRMTO

# Read the three COMEPELDA tables from the working directory. The string "."
# is read as NA in every file. `TRUE` is written out because `T` is an
# ordinary variable that can be reassigned.
# da: aggregate table (party-election-district rows)
da <- read.csv(file = "COMEPELDA_aggregate_v1.00.csv",
               header = TRUE, encoding = "UTF-8", na.strings = ".")
# dc: candidate table (one row per candidacy)
dc <- read.csv(file = "COMEPELDA_candidates_v1.00.csv",
               header = TRUE, encoding = "UTF-8", na.strings = ".")
# dm: MEP table (MEP-term-episode rows)
dm <- read.csv(file = "COMEPELDA_meps_v1.00.csv",
               header = TRUE, encoding = "UTF-8", na.strings = ".")


################################
### Counting Units           ###
################################


# columns that jointly identify a party in a given election
party_election_keys <- c("ElYear", "IDpty")

nrow(da) # Party-Election-district
# distinct party-election combinations in the aggregate table
da[, party_election_keys] %>% unique() %>% nrow()

# distinct party-election combinations in the candidate table
dc[, party_election_keys] %>% unique() %>% nrow() # (da has 15 2006 cases)

nrow(dm) # MEP-term-episode
nrow(dc) # Candidacies

################################
### Mandate types (Table 2)  ###
################################

# cross-tabulate dm$EP against dm$EpisType: counts first, then row shares
mandate_tab <- table(dm$EP, dm$EpisType)
mandate_tab
prop.table(mandate_tab, margin = 1)

#######################################################
### Re-nomination and re-election rates (Endnote 4) ###
#######################################################

# dm$EP by dm$Run01Lead: counts, then row shares
rerun_tab <- table(dm$EP, dm$Run01Lead)
rerun_tab
prop.table(rerun_tab, margin = 1)

# dm$EP by dm$ElectedAny01Lead: counts, then row shares
reelected_tab <- table(dm$EP, dm$ElectedAny01Lead)
reelected_tab
prop.table(reelected_tab, margin = 1)

######################
### Map (Figure 1) ###
######################

# ggplot2 map theme adapted from theme by Osmo Saloma
# Map theme: theme_bw() with axis titles, axis text, ticks and grid lines
# removed, a black panel border, and a vertical legend on the right.
#
# Args:
#   base_size:   base font size handed to theme_bw() (default 9).
#   base_family: base font family handed to theme_bw() (default "").
# Returns:
#   A ggplot2 theme object that can be added to a plot with `+`.
theme_osaad <- function(base_size = 9, base_family = "") {
  # The former require(grid) call is dropped: nothing below calls grid
  # directly, and require() only returns FALSE instead of failing when a
  # package cannot be loaded.
  theme_bw(base_size = base_size, base_family = base_family) %+replace%
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      axis.text = element_blank(),
      axis.ticks = element_blank(),
      legend.position = "right",
      legend.direction = "vertical",
      legend.title = element_text(face = "bold", hjust = 0),
      panel.border = element_rect(fill = NA, colour = "black"),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_text(vjust = 1),
      # `size` kept as is for the ggplot2 version pinned in the file header
      strip.background = element_rect(fill = "grey90", colour = "black", size = 0.3),
      strip.text = element_text()
    )
}

# One row per distinct (cName, cBallotType) pair among the 2014 rows of da.
dels <- unique(da[da$ElYear == 2014, c("cName", "cBallotType")])

# Code the ballot type as 1-4; anything that is not "flexible", "open" or
# "STV" (including NA) falls into category 1, as before.
ballot_code <- match(dels$cBallotType, c("flexible", "open", "STV"),
                     nomatch = 0L) + 1L

# Levels and labels are fixed explicitly, so each code always gets its own
# label even if one ballot type is absent from the data. (Assigning four
# labels positionally to the observed levels would shift the labels then.)
dels$BallotStructure <- factor(
  ballot_code,
  levels = 1:4,
  labels = c("closed list", "flexible list", "open list", "STV")
)

# Download country-level (NUTS 0) and NUTS 1 outlines from Eurostat.
# (the following lines will throw some warnings that appear irrelevant)
sf0 <- get_eurostat_geospatial(output_class = "df", resolution = "60", nuts_level = "0")
sf1 <- get_eurostat_geospatial(output_class = "df", resolution = "60", nuts_level = "1")
# add the NUTS 1 outline of Northern Ireland to the country-level outlines
sf <- bind_rows(sf0, sf1[sf1$NUTS_NAME == "NORTHERN IRELAND", ])

# Translate the Eurostat codes to English country names, then rename the
# entries listed below so that they match the cName values being joined on.
sf$cName <- countrycode(sf$geo, "eurostat", "country.name.en")
sf$cName[sf$cName == "Czechia"] <- "Czech Republic"
sf$cName[sf$cName == "United Kingdom"] <- "Great Britain"
sf$cName[sf$NUTS_NAME == "NORTHERN IRELAND"] <- "Northern Ireland"
# The join key is named explicitly, so it no longer depends on which column
# names the two tables happen to share (and the join message is silenced).
sf <- left_join(sf, dels[, c("cName", "BallotStructure")], by = "cName")

# Base layer: grey outlines with white polygons for every shape in sf,
# cropped to the plotting window given in coord_cartesian().
rawmap <- ggplot(sf, aes(long, lat, group = group)) +
  geom_path(color = "grey") +
  geom_polygon(fill = "white") +
  coord_cartesian(xlim = c(-11, 32), ylim = c(34, 72)) +
  theme_osaad()

# one fill colour per BallotStructure level
mycolors <- c('#ffffcc', '#a1dab4', '#41b6c4', '#225ea8')

# Overlay polygons filled by ballot structure (shapes without a value get no
# legend entry), redraw the outlines on top, and label Malta by hand.
ballot_map <- rawmap +
  geom_polygon(
    data = sf,
    aes(long, lat, group = group, fill = BallotStructure)
  ) +
  scale_fill_manual(values = mycolors, na.translate = FALSE) +
  geom_path(color = "grey") +
  annotate("text", x = 15.5, y = 35.25, label = "Malta, STV", size = 3)
ballot_map

ggsave(filename = "Figure1_map.eps", plot = ballot_map, width = 7, height = 7)


##########################################
### Intra-party crowdedness (Figure 2) ###
##########################################

# remove independent candidates and German cases (and the 2006 rows)
keep_row <- da$ElYear != 2006 & da$IndepCand01 == 0 & da$cName != "Germany"
dag <- da[keep_row, ]
# keep party-district rows with at least one seat
dag <- dag[dag$pdSeats > 0, ]
with(dag, summary(pdNofCandEl / pdSeats))

# Pearson and Spearman correlation between the number of candidates and
# district magnitude, using pairwise complete observations
with(dag, cor(pdNofCandEl, dMag, use = "pairwise.complete.obs"))
with(dag, cor(pdNofCandEl, dMag, use = "pairwise.complete.obs",
              method = "spearman"))

# candidates-per-seat ratio against district magnitude, both on log10 scales
crowd_plot <- ggplot(dag, aes(dMag, pdNofCandEl / pdSeats)) +
  geom_count(shape = 1) +
  geom_smooth(colour = "grey") +
  scale_x_log10(breaks = c(1:5, 7, 10, 15, 25, 40, 60, 100)) +
  scale_y_log10() +
  labs(x = "District Magnitude", y = "Candidates/Seats Ratio") +
  theme_bw() +
  theme(legend.position = "bottom")
crowd_plot

ggsave(filename = "Figure2_ipcompetition.eps", plot = crowd_plot,
       width = 10, height = 8, units = "cm")


#######################################
### Preference vote use (Figure 3)  ###
#######################################

rm(dag)
# rows where cPrefVotOblig01 is 0 and cPrefVotN is "1"
pref_rows <- da$cPrefVotOblig01 == 0 & !is.na(da$cPrefVotN) & da$cPrefVotN == "1"
dag <- da[pref_rows, ]
# preference votes cast relative to the party's votes in the district
dag$PrefUse <- dag$pdPrefVotSum / dag$pdAbs
summary(dag$PrefUse)
dag <- dag[!is.na(dag$PrefUse), ] # drop two Slovakian cases with missings

# recode election years 2007 and 2013 to 2004 and 2009
dag$Year <- replace(dag$ElYear, dag$ElYear == 2007, 2004)
dag$Year <- replace(dag$Year, dag$Year == 2013, 2009)

# country-year mean of preference vote use, attached to every row
dag <- dag %>%
  group_by(cName, Year) %>%
  mutate(mpu = mean(PrefUse))
dag$fCou <- factor(dag$cName)

dag %>%
  group_by(cName) %>%
  summarize(tmp = mean(mpu))

# hand-picked display order of the country levels
country_order <- c(1, 2, 7, 3, 5, 4, 6)
dag$fCou <- factor(dag$fCou, levels(dag$fCou)[country_order]) # order

dag$Year <- factor(dag$Year)

# hollow points: individual party-district values; filled points: the
# country-year means
p <- ggplot(dag, aes(fCou, PrefUse, colour = Year)) +
  geom_point(shape = 1, size = 1, position = position_dodge(width = .7)) +
  geom_point(aes(y = mpu), shape = 19, size = 2,
             position = position_dodge(width = .7)) +
  scale_color_manual(values = c('#d7191c', '#fdae61', '#abdda4', '#2b83ba')) +
  ylim(0, 1) +
  labs(y = "Share of party voters casting a preference vote", x = "") +
  guides(color = guide_legend(reverse = TRUE,
                              title = "Year\n(most recent at top)")) +
  coord_flip() +
  theme_bw() +
  theme(
    legend.position = "right",
    legend.title.align = .5,
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    axis.title = element_text(size = 10),
    axis.text = element_text(size = 8)
  )
p
ggsave(filename = "Figure3_prefuse.eps", plot = p,
       width = 12.5, height = 10, units = "cm")


####################################################
### MEPs elected on pref. votes only (Figure 4)  ###
####################################################

# Attach district/party-level variables from the aggregate table to every
# candidacy; candidacies without a match are kept (all.x = TRUE).
dcg <- merge(dc, unique(da[, c("IDAD", "cBallotType", "cRankedList01",
                               "pdSeats", "pdPrefThrEmp")]), all.x = TRUE)

# Keep flexible- and open-list systems with ranked lists. The column is
# referred to by its full name, cRankedList01 (the name merged in above),
# instead of relying on `$` partial matching of "cRankedList".
dcg <- dcg[dcg$cBallotType %in% c("flexible", "open") &
             !is.na(dcg$cRankedList01) & dcg$cRankedList01 == 1, ]
dcg <- dcg[dcg$IDpty > -9999, ] # remove indep. candidates
dcg <- dcg[!is.na(dcg$ListRankEl), ] # remove missings (some Italian cases and Zavada)

# Indicator for 'jumping': elected although the list rank exceeds the
# number of seats won by the party in the district
dcg$jump <- ifelse(!is.na(dcg$Elected01) & dcg$Elected01 == 1 &
                     dcg$ListRankEl > dcg$pdSeats, 1, 0)
# the indicator is only defined for elected candidates
dcg$jump[dcg$Elected01 == 0] <- NA
table(dcg$jump)
# stricter definition of 'jumping': the list rank must exceed the number of
# seats by more than two
dcg$jumpstr <- dcg$jump
dcg$jumpstr[dcg$jump == 1 & dcg$ListRankEl <= (dcg$pdSeats + 2)] <- 0
table(dcg$jump, dcg$jumpstr)


# Elected candidates only, sorted by country and election year with
# 'jumpers' first within each election.
dj <- dcg[!is.na(dcg$Elected01) & dcg$Elected01 == 1, ]
dj <- arrange(dj, cName, ElYear, -jump)
# running counter within each country-election
dj$index <- ave(dj$ListRankEl, dj$cName, dj$ElYear, FUN = seq_along)
dj$fCou <- factor(dj$cName)
dj$PVonly <- factor(dj$jump)
# recode election years 2007 and 2013 to 2004 and 2009
dj$Year <- dj$ElYear
dj$Year[dj$Year == 2007] <- 2004
dj$Year[dj$Year == 2013] <- 2009
#dj$Year <- factor(dj$Year)

# Country-level share of 'jumpers' and the resulting sort position.
# ties.method = "first" guarantees a distinct integer position per country;
# the default (averaged ranks) would give tied countries the same position
# and interleave their rows in any later sort on csorter. Without ties the
# result is unchanged.
tmp <- summarize(group_by(dj, cName), tmp = mean(jump))
tmp$csorter <- rank(tmp$tmp, ties.method = "first")
#dj$fCou = factor(dj$fCou,levels(dj$fCou)[order(tmp$tmp)]) # order

# distribution of seats won across the party-election-district units
tmpseats <- unique(dj[, c("IDpty", "ElYear", "dName", "pdSeats")])
table(tmpseats$pdSeats)
cumsum(prop.table(table(tmpseats$pdSeats)))

# Per country and year: shares of elected candidates who
#   N_1: 'jumped' under the strict definition,
#   N_2: 'jumped' under the wide but not the strict definition,
#   N_3: did not 'jump';
# total is the number of elected candidates in the cell.
dj2 <- summarize(group_by(dj, cName, Year),
                 N_1 = sum(jumpstr == 1) / n(),
                 N_2 = sum(jump == 1 & jumpstr == 0) / n(),
                 N_3 = sum(jump == 0) / n(),
                 total = n())
dj2 <- mutate(group_by(dj2, cName), meanN = mean(total))
dj2 <- merge(dj2, tmp[, c("cName", "csorter")])
dj2 <- dj2[order(dj2$csorter, dj2$Year), ]
# Vertical position of each country-year bar. seq_len() is used instead of
# 1:nrow() so that an empty table does not yield the sequence c(1, 0).
dj2$index <- seq_len(nrow(dj2))
# shift each country's rows by its sort position minus one (the first
# country is not shifted)
dj2$index <- dj2$index + dj2$csorter - 1
dj2 <- mutate(group_by(dj2, cName), medindex = median(index))

dj2a <- dj2
dj2b <- dj2

# first segment: from 0 to the strict-jump share
dj2$x1 <- 0
dj2$x2 <- dj2$N_1
dj2 <- dj2[order(dj2$cName, dj2$Year), ]
# (Year - 1994) / 5 maps e.g. 1999, 2004, 2009, 2014 to 1, 2, 3, 4
dj2$colorindex <- (dj2$Year - 1994) / 5

# second segment: the additional share under the wide definition
dj2a$x1 <- dj2a$N_1
dj2a$x2 <- dj2a$N_1 + dj2a$N_2
dj2a <- dj2a[order(dj2a$cName, dj2a$Year), ]
dj2a$colorindex <- (dj2a$Year - 1994) / 5 + 4

# third segment: the remainder up to 1
dj2b$x1 <- dj2b$N_1 + dj2b$N_2
dj2b$x2 <- 1
dj2b$colorindex <- 9


dj2 <- bind_rows(dj2, dj2a, dj2b)

dj2$fYear <- factor(dj2$Year)

# Axis labels: one row per country, taken from the 2014 rows. dj2 now holds
# every country-year three times (once per segment), so duplicates are
# removed to avoid repeating each axis break and label three times.
mylab <- dj2[, c("Year", "cName", "index", "medindex")]
mylab <- unique(mylab[mylab$Year == 2014, ])
mylab$index <- mylab$index + 1

# Palette: four year colours (positions 1-4), their lightened versions
# (positions 5-8), and a near-white colour (position 9).
base_colours <- c('#d7191c', '#fdae61', '#abdda4', '#2b83ba')
mycolours <- c(base_colours, lighten(base_colours, amount = .5), '#fcfcfc')

# Segments with colorindex 5 to 9 are drawn with a fixed colour (not a
# mapped one), so they are plotted separately to exclude them from the legend.
unmapped_segments <- lapply(5:9, function(k) {
  geom_segment(data = dj2[dj2$colorindex == k, ],
               aes(x = x1, xend = x2), colour = mycolours[k], size = 2)
})

fig4 <- ggplot(dj2, aes(y = index, yend = index)) +
  # segments with colorindex below 5 carry the year legend
  geom_segment(data = dj2[dj2$colorindex < 5, ],
               aes(x = x1, xend = x2, colour = fYear), size = 2,
               show.legend = NA) +
  unmapped_segments +
  # one circle per country left of the bars, sized by meanN
  geom_point(data = dj2[dj2$Year == 2014, ],
             aes(x = -.07, y = medindex, size = meanN),
             shape = 21, color = "black") +
  scale_x_continuous(limits = c(-.1, NA)) +
  scale_y_continuous(expand = expansion(add = 2),
                     breaks = mylab$medindex, labels = mylab$cName) +
  scale_color_manual(values = mycolours, drop = TRUE) +
  scale_size_continuous(breaks = c(6, 15, 30, 75)) +
  labs(x = "Share", y = "") +
  guides(
    size = guide_legend(title = "Mean N of MEPs", order = 1),
    colour = guide_legend(reverse = TRUE,
                          title = "Year\n(most recent at top)",
                          order = 2)
  ) +
  theme_bw() +
  theme(
    legend.position = "right",
    legend.title.align = .5,
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    axis.title = element_text(size = 10),
    axis.text = element_text(size = 8)
  )
fig4

ggsave(filename = "Figure4_MEPsonpv.eps", plot = fig4,
       width = 12, height = 14, units = "cm")


#############################################################
### Predicted probabilities from multin. logit (Figure 5) ###
#############################################################

# the models are estimated in the Stata do-file reselection_regressions.do

# estimates exported for plotting (Stata format)
d <- read.dta13("reselection_dataforgraph.dta")

# sum of `mean` within each act-by-terc cell
d %>%
  group_by(act, terc) %>%
  summarize(test = sum(mean))

d$Tercile <- factor(d$terc)
d$Change <- factor(d$outcome)
levels(d$Change) <- c("Demotion", "None", "Promotion")

# the same three colours are used for point outline, fill and interval line
change_colours <- c("red", "grey60", "green")

# point estimates with lower/upper intervals by tercile, one panel per `act`
fig5 <- ggplot(d, aes(Tercile, mean,
                      colour = Change, shape = Change, fill = Change)) +
  geom_point(position = position_dodge(width = .5)) +
  geom_linerange(aes(ymin = lower, ymax = upper),
                 position = position_dodge(width = .5)) +
  scale_shape_manual(values = c(25, 21, 24)) +
  scale_color_manual(values = change_colours) +
  scale_fill_manual(values = change_colours) +
  facet_wrap(~ act) +
  labs(y = "Predicted probability") +
  theme_bw() +
  theme(legend.position = "bottom")
fig5

ggsave(filename = "Figure5_reselection.eps", plot = fig5,
       width = 14, height = 10, units = "cm")

